Dual CRISPR Screen Analysis

Construct Scaffold Trimming

Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

Instructions

To run this notebook reproducibly, follow these steps:

  1. Click Kernel > Restart & Clear Output
  2. When prompted, click the red Restart & clear all outputs button
  3. Fill in the values for your analysis for each of the variables in the Input Parameters section
  4. Click Cell > Run All

Input Parameters


In [ ]:
# number of processors passed to parallel_process_paired_reads in the Scaffold Trim Execution cell
g_num_processors = 3
# top-level directory holding the raw fastq(.gz) files; they are gunzipped and flattened in place here
g_fastqs_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/raw/20160504_D00611_0275_AHMM2JBCXX'
# directory given to the trimming functions as their output directory
g_trimmed_fastqs_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160504_D00611_0275_AHMM2JBCXX'
# 5' scaffold sequences trimmed from read 1 (r1) and read 2 (r2)
g_full_5p_r1 = 'TATATATCTTGTGGAAAGGACGAAACACCG'
g_full_5p_r2 = 'CCTTATTTTAACTTGCTATTTCTAGCTCTAAAAC'
# 3' scaffold sequences trimmed from read 1 (r1) and read 2 (r2)
g_full_3p_r1 = 'GTTTCAGAGCTATGCTGGAAACTGCATAGCAAGTTGAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTACTGAG'
g_full_3p_r2 = 'CAAACAAGGCTTTTCTCCAAGGGATATTTATAGTCTCAAAACACACAATTACTTTACAGTTAGGGTGAGTTTCCTTTTGTGCTGTTTTTTAAAATA'
# directory appended to sys.path in the CCBB Library Imports cell
g_code_location = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python'

CCBB Library Imports


In [ ]:
import sys
# add the code directory to the module search path so the ccbbucsd imports in later cells
# can resolve (assumes the ccbbucsd package lives under g_code_location -- confirm)
sys.path.append(g_code_location)

Automated Set-Up


In [ ]:
# %load -s describe_var_list /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/utilities/analysis_run_prefixes.py
def describe_var_list(input_var_name_list):
    """Return a printable description of the named variables, one "name: value" line each.

    Args:
        input_var_name_list: iterable of strings; each one is evaluated with eval() from
            inside this function, so it must be an expression that is valid in the
            namespace where this function is defined.

    Returns:
        str: the concatenated "name: value\\n" lines, or "" when the input is empty.

    NOTE: eval() executes arbitrary expressions -- pass only trusted, hard-coded names.
    """
    # eval looks each name up at call time, so the current value of the variable is reported
    description_list =  ["{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list]
    return "".join(description_list)

In [ ]:
from ccbbucsd.utilities.analysis_run_prefixes import check_or_set, get_run_prefix, get_timestamp
# NOTE(review): presumably check_or_set falls back to g_fastqs_dir when no trimmed-output
# directory was supplied -- confirm against analysis_run_prefixes.py
g_trimmed_fastqs_dir = check_or_set(g_trimmed_fastqs_dir, g_fastqs_dir)
# echo the resolved value so it is recorded in the notebook output
print(describe_var_list(['g_trimmed_fastqs_dir']))

In [ ]:
from ccbbucsd.utilities.files_and_paths import verify_or_make_dir
# NOTE(review): presumably creates the trimmed-output directory when it does not exist yet -- confirm
verify_or_make_dir(g_trimmed_fastqs_dir)

Info Logging Pass-Through


In [ ]:
from ccbbucsd.utilities.notebook_logging import set_stdout_info_logger
# NOTE(review): presumably routes info-level log messages to stdout so they appear as cell output -- confirm
set_stdout_info_logger()

Scaffold Trimming Functions


In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/scaffold_trim.py
# standard libraries
import enum

# third-party libraries
import cutadapt.scripts.cutadapt

# ccbb libraries
from ccbbucsd.utilities.files_and_paths import get_file_name_pieces, make_file_path

__author__ = 'Amanda Birmingham'
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"


class TrimType(enum.Enum):
    """Which end(s) of a read a trimming pass handles.

    Each member's value is the text get_trimmed_suffix embeds in the output file's suffix.
    """
    # 5' end only (used by _run_cutadapt_global when is_5p is true)
    FIVE = "5"
    # 3' end only (used by _run_cutadapt_global when is_5p is false)
    THREE = "3"
    # both ends in one pass (used by trim_linked_scaffold)
    FIVE_THREE = "53"


def get_trimmed_suffix(trimtype):
    """Build the filename suffix that marks a fastq file as trimmed.

    Args:
        trimtype: object whose ``value`` attribute identifies the kind of trim
            (e.g. a TrimType member).

    Returns:
        str: "_trimmed", followed by the formatted value, followed by ".fastq".
    """
    trim_code = trimtype.value
    return "_trimmed{code}.fastq".format(code=trim_code)


def trim_linked_scaffold(output_dir, fastq_fp, scaffold_seq_5p, scaffold_seq_3p, quiet=True):
    """Trim the 5' and 3' scaffolds from a fastq file in a single cutadapt run.

    Args:
        output_dir: directory in which the trimmed fastq is written.
        fastq_fp: path of the fastq file to trim.
        scaffold_seq_5p: scaffold sequence expected on the 5' side.
        scaffold_seq_3p: scaffold sequence expected on the 3' side.
        quiet: when true, "--quiet" is added to the cutadapt arguments.

    Returns:
        Path of the trimmed fastq, as returned by _run_cutadapt.
    """
    # the two scaffolds are joined with "..." and handed to cutadapt's -a switch
    linked_adapter = "{five}...{three}".format(five=scaffold_seq_5p, three=scaffold_seq_3p)
    cutadapt_args = ["-a", linked_adapter]
    return _run_cutadapt(output_dir, fastq_fp, TrimType.FIVE_THREE, cutadapt_args, quiet)


def trim_global_scaffold(output_dir, fastq_fp, scaffold_seq_5p=None, scaffold_seq_3p=None, quiet=True):
    """Trim the 5' and/or 3' scaffold from a fastq file, one cutadapt run per end.

    Args:
        output_dir: directory in which trimmed fastq files are written.
        fastq_fp: path of the fastq file to trim.
        scaffold_seq_5p: 5' scaffold sequence, or None to skip the 5' pass.
        scaffold_seq_3p: 3' scaffold sequence, or None to skip the 3' pass.
        quiet: forwarded to each cutadapt run.

    Returns:
        Path of the output of the last pass that ran; fastq_fp itself when both
        scaffold sequences are None.
    """
    latest_fp = fastq_fp

    # the 5' pass comes first, so a following 3' pass reads the 5'-trimmed output
    ordered_passes = ((scaffold_seq_5p, True), (scaffold_seq_3p, False))
    for scaffold_seq, is_5p in ordered_passes:
        if scaffold_seq is None:
            continue
        latest_fp = _run_cutadapt_global(output_dir, latest_fp, scaffold_seq, is_5p, quiet)

    return latest_fp


def _run_cutadapt_global(output_dir, input_fastq_fp, seq_to_trim, is_5p, quiet):
    """Run cutadapt to trim one sequence from a single end of the reads.

    Args:
        output_dir: directory in which the trimmed fastq is written.
        input_fastq_fp: path of the fastq file to trim.
        seq_to_trim: sequence handed to cutadapt.
        is_5p: true selects the "-g" switch and TrimType.FIVE; false selects
            "-a" and TrimType.THREE.
        quiet: forwarded to _run_cutadapt.

    Returns:
        Path of the trimmed fastq, as returned by _run_cutadapt.
    """
    if is_5p:
        end_switch, end_name = "-g", TrimType.FIVE
    else:
        end_switch, end_name = "-a", TrimType.THREE

    return _run_cutadapt(output_dir, input_fastq_fp, end_name, [end_switch, seq_to_trim], quiet)


def _run_cutadapt(output_dir, input_fastq_fp, trim_name, partial_args, quiet):
    """Invoke cutadapt's main() on one fastq file and report where the output went.

    Args:
        output_dir: directory in which the trimmed fastq is written.
        input_fastq_fp: path of the fastq file to trim.
        trim_name: trim type passed to get_trimmed_suffix to name the output file.
        partial_args: adapter-related cutadapt arguments; copied, not modified.
        quiet: when true, "--quiet" is added after the partial arguments.

    Returns:
        Path of the trimmed fastq: the input's base name plus the trimmed suffix,
        placed in output_dir.
    """
    _, input_base, _ = get_file_name_pieces(input_fastq_fp)
    output_fastq_fp = make_file_path(output_dir, input_base, get_trimmed_suffix(trim_name))

    # final order: adapter arguments, optional --quiet, then output and input paths
    quiet_flag = ["--quiet"] if quiet else []
    cutadapt_args = list(partial_args) + quiet_flag + ["-o", output_fastq_fp, input_fastq_fp]

    cutadapt.scripts.cutadapt.main(cutadapt_args)
    return output_fastq_fp

In [ ]:
def trim_fw_and_rv_reads(output_dir, full_5p_r1, full_3p_r1, full_5p_r2, full_3p_r2, fw_fastq_fp, rv_fastq_fp):
    """Trim the linked scaffolds from a forward/reverse pair of fastq files.

    The forward file is trimmed with the r1 scaffolds, then the reverse file with
    the r2 scaffolds; both trimmed files are written to output_dir.

    Args:
        output_dir: directory in which the trimmed fastq files are written.
        full_5p_r1: 5' scaffold sequence for the forward reads.
        full_3p_r1: 3' scaffold sequence for the forward reads.
        full_5p_r2: 5' scaffold sequence for the reverse reads.
        full_3p_r2: 3' scaffold sequence for the reverse reads.
        fw_fastq_fp: path of the forward-read fastq file.
        rv_fastq_fp: path of the reverse-read fastq file.

    Returns:
        None.
    """
    per_read_inputs = (
        (fw_fastq_fp, full_5p_r1, full_3p_r1),
        (rv_fastq_fp, full_5p_r2, full_3p_r2),
    )
    for fastq_fp, scaffold_5p, scaffold_3p in per_read_inputs:
        trim_linked_scaffold(output_dir, fastq_fp, scaffold_5p, scaffold_3p)

Gzipped FASTQ Filenames


In [ ]:
# extension of the uncompressed sequence files
g_seq_file_ext_name = ".fastq"
# extension appended to the sequence extension for gzip-compressed files
g_gzip_ext_name = ".gz"

In [ ]:
from ccbbucsd.utilities.files_and_paths import summarize_filenames_for_prefix_and_suffix
# summarize the files under g_fastqs_dir (subdirectories included) whose names end in
# ".fastq.gz"; the empty string is passed as the prefix
print(summarize_filenames_for_prefix_and_suffix(g_fastqs_dir, "", 
                                                "{0}{1}".format(g_seq_file_ext_name, g_gzip_ext_name), 
                                                all_subdirs=True))

FASTQ Gunzip Execution


In [ ]:
from ccbbucsd.utilities.files_and_paths import gunzip_wildpath, move_to_dir_and_flatten

def unzip_and_flatten_seq_files(top_fastqs_dir, ext_name, gzip_ext_name, keep_gzs):
    """Gunzip every compressed sequence file under a directory, then flatten the results.

    Args:
        top_fastqs_dir: directory searched for compressed files; also the
            destination the uncompressed files are moved into.
        ext_name: extension of the uncompressed sequence files.
        gzip_ext_name: extension added to ext_name for the compressed files.
        keep_gzs: forwarded to gunzip_wildpath to say whether the compressed
            originals are kept.

    Returns:
        None.
    """
    gzipped_suffix = ext_name + gzip_ext_name
    search_subdirs = True

    # step 1: expand the compressed files wherever they sit below the top directory
    gunzip_wildpath(top_fastqs_dir, gzipped_suffix, keep_gzs, search_subdirs)
    # step 2: pull the expanded files up to the top directory so later steps need no recursion
    move_to_dir_and_flatten(top_fastqs_dir, top_fastqs_dir, ext_name)

In [ ]:
# gunzip the fastq.gz files under g_fastqs_dir and move the resulting fastqs to its top level
# False = don't keep gzs as well as expanding, True = do keep them (True only works for gzip 1.6+)
unzip_and_flatten_seq_files(g_fastqs_dir, g_seq_file_ext_name, g_gzip_ext_name, False)

FASTQ Filenames


In [ ]:
# summarize the ".fastq" files in g_fastqs_dir to confirm the gunzip/flatten step produced them
print(summarize_filenames_for_prefix_and_suffix(g_fastqs_dir, "", g_seq_file_ext_name))

Scaffold Trim Execution


In [ ]:
from ccbbucsd.utilities.parallel_process_fastqs import parallel_process_paired_reads, concatenate_parallel_results
# the list holds the leading arguments of trim_fw_and_rv_reads, in signature order
# NOTE(review): presumably parallel_process_paired_reads supplies each forward/reverse fastq
# pair as the final two arguments -- confirm against parallel_process_fastqs.py
g_parallel_results = parallel_process_paired_reads(g_fastqs_dir, g_seq_file_ext_name, g_num_processors, 
                                                   trim_fw_and_rv_reads, [g_trimmed_fastqs_dir, g_full_5p_r1, 
                                                                          g_full_3p_r1, g_full_5p_r2, g_full_3p_r2])

In [ ]:
# show the combined output of the parallel trimming runs
print(concatenate_parallel_results(g_parallel_results))

Trimmed FASTQ Filenames


In [ ]:
# summarize the files in g_trimmed_fastqs_dir that carry the "_trimmed53.fastq" suffix written by trim_linked_scaffold
print(summarize_filenames_for_prefix_and_suffix(g_trimmed_fastqs_dir, "", get_trimmed_suffix(TrimType.FIVE_THREE)))